This notebook is to explore Canadian SARS-CoV-2 genomic and epidemiological data, for discussion with pillar 6’s team and for sharing with collaborators. These analyses can spur further research within or across pillars, be used for reports (or data dashboards), given to the science communication pillar for public dissemination, and the code can be repackaged to give to public health authorities for their internal use.
Canadian genomic and epidemiological data will be regularly pulled from various public sources to keep these analyses up-to-date. We would like to acknoweldge all Canadian laboratories across the country that are making these data publically available. Only representations of aggregate data will be posted here. Below is a list of data sources and lab authors.
## 1. LOAD processed metadata of Canadian sequences (with latest pangolin, division, and full seq IDs)
#Download metadata from gisaid, put the date here:
gisaiddate="2022_01_17"
#date=2022_01_17
#tar -axf metadata_tsv_$date.tar.xz metadata.tsv -O | tr ' ' '_' | sed 's/\t\t/\tNA\t/g' | sed 's/\t\t/\tNA\t/g' | sed 's/\t$/\tNA/g' | awk 'NR==1 || substr($1,9,6)=="Canada" && $8=="Human"' | sort -k3,3 > metadata_CANall_$date.uncorrected.csv
#cat metadata_virrusseq_2022_01_17.tsv | tr ' ' '_' | sed 's/\t\t/\tNA\t/g' | sed 's/\t\t/\tNA\t/g' | sed 's/\t$/\tNA/g' | awk 'NR!=1 && $43!="NA"' | cut -f5,43 | sort -k2,2 > epidatesfromvirrusseq
#join -1 3 metadata_CANall_$date.uncorrected.csv -a 1 -2 2 epidatesfromvirrusseq | awk '$4!=$23 && length($4)<10 && length($23)==10{$4=$23} {id=$1;$1=$2;$2=$3;$3=id} {print}' | tr ' ' '\t'| cut -f-22 > metadata_CANall_$date.csv
#zip CoVaRRNet_pillar6notebook/data_needed/metadata_CANall_$date.zip metadata_CANall_$date.csv
metaCANall <- read.csv(unz(paste("./data_needed/metadata_CANall_",gisaiddate,".zip",sep=""),paste("metadata_CANall_",gisaiddate,".csv",sep="")),sep="\t",row.names=NULL)
metaCANall$Collection_date <- as.Date(metaCANall$Collection_date)
#max(metaCANall$Collection.date) - min(metaCANall$Collection.date) #time diff: 580 days
#make a pango.group column
metaCANall$pango.group <- metaCANall$Variant
metaCANall$pango.group[is.na(metaCANall$pango.group)]<-"other"
metaCANall$pango.group[metaCANall$pango.group == "VOC_Alpha_202012/01_GRY_(B.1.1.7+Q.x)_first_detected_in_the_UK"]<-"Alpha"
metaCANall$pango.group[metaCANall$pango.group == "VOC_Beta_GH/501Y.V2_(B.1.351+B.1.351.2+B.1.351.3)_first_detected_in_South_Africa" ]<-"Beta"
metaCANall$pango.group[metaCANall$pango.group == "VOC_Gamma_GR/501Y.V3_(P.1+P.1.x)_first_detected_in_Brazil/Japan" ]<-"Gamma"
metaCANall$pango.group[metaCANall$pango.group == "VOC_Delta_GK/478K.V1_(B.1.617.2+AY.x)_first_detected_in_India" ]<-"Delta"
metaCANall$pango.group[metaCANall$Pango_lineage == "AY.25" ]<-"Delta AY.25"
metaCANall$pango.group[metaCANall$Pango_lineage == "AY.27" ]<-"Delta AY.27"
metaCANall$pango.group[metaCANall$pango.group == "VOI_Lambda_GR/452Q.V1_(C.37+C.37.1)_first_detected_in_Peru" ]<-"Lambda"
metaCANall$pango.group[metaCANall$pango.group == "VOC_Omicron_GRA_(B.1.1.529+BA.*)_first_detected_in_Botswana/Hong_Kong/South_Africa" ]<-"Omicron"
metaCANall$pango.group[metaCANall$pango.group == "VOI_Mu_GH_(B.1.621+B.1.621.1)_first_detected_in_Colombia" ]<-"Mu"
metaCANall$pango.group[str_detect(metaCANall$pango.group,"V") ]<-"other"
metaCANall$pango.group[metaCANall$Pango_lineage == "A.23.1" ]<-"A.23.1"
metaCANall$pango.group[metaCANall$Pango_lineage == "B.1.438.1" ]<-"B.1.438.1"
metaCANall$pango.group[grepl("B\\.1\\.1\\.529|BA\\.", metaCANall$Pango_lineage)] <- "Omicron"
## 2. LOAD epidemiological data (PHAC)
epidataCANall <- read.csv(url("https://health-infobase.canada.ca/src/data/covidLive/covid19-download.csv"))
#from: https://health-infobase.canada.ca/covid-19/epidemiological-summary-covid-19-cases.html?stat=num&measure=total&map=pt#a2
epidate = tail(epidataCANall,1)$date
#epidataAlberta <- epidataCANall[ which(epidataCANall$prnameFR=='Alberta'), ]
#print(epidataAlberta$numtoday)
Sequence counts for all Canadian data in GISAID, up to 2022-01-20, shows that by end of summer Alpha and Gamma were still the most sequenced variants. Note that some of the “other” category include some Delta sublineages (AYs) but overall, from the beginning of the pandemic to the fall of 2021, Canadian sequences were mostly of the wildtype lineages (pre-VOCs).
# --- Histogram plot: counts per variant of all canadian data -------------
listpango <- unique(metaCANall$pango.group)
print(listpango)
## [1] "other" "B.1.438.1" "A.23.1" "Alpha" "Beta"
## [6] "Gamma" "Delta" "Lambda" "Delta AY.27" "Delta AY.25"
## [11] "Mu" "Omicron"
listpango <- listpango[order(listpango)]
getpal <- colorRampPalette(brewer.pal(9, "Paired")) #"Set3
pal <- getpal(length(listpango))
names(pal) <- listpango
pal["Omicron"]="#E52823"
pal["other"]="grey"
pal["Alpha"]="#B29C71"
pal["Delta"]="#A6CEE3"
pal["Delta AY.27"]="#438FC0"
pal["Delta AY.25"]="#61A6A0"
pal["A.23.1"]="#9AD378"
pal["B.1.438.1"]="#3EA534"
pal["Beta"]="#CAB2D6"
pal["Gamma"]="#444444"
pal["Lambda"]="#F08C3A"
pal["Mu"]="#F08C3A"
#print(pal)
#------------- counts over time (by week)
plot_metadatadf <- function(meta_tab,pal) {
meta_tab <- filter(meta_tab, !is.na(meta_tab$Collection_date))
df1 <- meta_tab %>% group_by(pango.group) %>% group_by(week = cut(Collection_date, "week")) #adds week coloum
dfcount <- df1 %>% group_by(week) %>% count(pango.group)
#plot
ggplot(dfcount, aes(x=as.Date(week), y= n, fill=pango.group))+geom_bar(stat = "identity")+
scale_fill_manual(breaks=listpango, values = pal)+ ylab("")+xlab("")+
scale_x_date(date_breaks = "28 day")+
theme_classic()+theme(axis.text.x = element_text(angle=90, vjust=0.1,hjust=0.1))
}
plot_metadatadf_freq <- function(meta_tab,pal) {
meta_tab <- filter(meta_tab, !is.na(meta_tab$Collection_date))
df1 <- meta_tab %>% group_by(pango.group) %>% group_by(week = cut(Collection_date, "week")) #adds week coloum
dfcount <- df1 %>% group_by(week) %>% count(pango.group)
dfcount %>% mutate(freq = prop.table(n)) -> dfprop
#plot
ggplot(dfprop, aes(x= as.Date(week), y=freq, fill=pango.group))+geom_bar(stat = "identity")+
scale_fill_manual(breaks=listpango, values = pal)+ ylab("")+xlab("")+
scale_x_date(date_breaks = "28 day")+
theme_classic()+theme(axis.text.x = element_text(angle=90, vjust=0.1,hjust=0.1))
}
plot_metadatadf(metaCANall,pal)
plot_metadatadf_freq(metaCANall,pal)
There are two PANGO lineages that have a Canadian origin and have predominately spread within Canada (with some exportations internationally): B.1.438.1 and B.1.1.176.
Other lineages of Canadian interest:
Here we show the same plots but for each provinces
loc=sapply(strsplit(metaCANall$Location,"_/_"), `[`, 3)
metaCANall$province <- loc
print(unique(metaCANall$province))
## [1] "Quebec" "Ontario"
## [3] "Newfoundland_and_Labrador" "Nova_Scotia"
## [5] "New_Brunswick" "Manitoba"
## [7] "Alberta" "British_Columbia"
## [9] "Saskatchewan" NA
## [11] "Newfoundland"
plot_metadatadf(metaCANall[ which(metaCANall$province=='British_Columbia'), ],pal)
plot_metadatadf_freq(metaCANall[ which(metaCANall$province=='British_Columbia'), ],pal)
plot_metadatadf(metaCANall[ which(metaCANall$province=='Alberta'), ],pal)
plot_metadatadf_freq(metaCANall[ which(metaCANall$province=='Alberta'), ],pal)
plot_metadatadf(metaCANall[ which(metaCANall$province=='Saskatchewan'), ],pal)
plot_metadatadf_freq(metaCANall[ which(metaCANall$province=='Saskatchewan'), ],pal)
plot_metadatadf(metaCANall[ which(metaCANall$province=='Manitoba'), ],pal)
plot_metadatadf_freq(metaCANall[ which(metaCANall$province=='Manitoba'), ],pal)
plot_metadatadf(metaCANall[ which(metaCANall$province=='Ontario'), ],pal)
plot_metadatadf_freq(metaCANall[ which(metaCANall$province=='Ontario'), ],pal)
plot_metadatadf(metaCANall[ which(metaCANall$province=='Quebec'), ],pal)
plot_metadatadf_freq(metaCANall[ which(metaCANall$province=='Quebec'), ],pal)
plot_metadatadf(metaCANall[ which(metaCANall$province=='Nova_Scotia'), ],pal)
plot_metadatadf_freq(metaCANall[ which(metaCANall$province=='Nova_Scotia'), ],pal)
plot_metadatadf(metaCANall[ which(metaCANall$province=='New_Brunswick'), ],pal)
plot_metadatadf_freq(metaCANall[ which(metaCANall$province=='New_Brunswick'), ],pal)
subtab=metaCANall[ which(metaCANall$province=='Newfoundland' | metaCANall$province=='Newfoundland_and_Labrador' ), ]
plot_metadatadf(subtab,pal)
plot_metadatadf_freq(subtab,pal)
Down-sampled Canadian SARS-CoV-2 genomes. Taken from GISAID Sept. 12th, 2021. Alignment GISAID, ML tree in IQ-tree. This tree was generated using TreeTime and visualized in ggtree.
This tree shows several features of VOC spread in Canada:
These last two points are suggestive of strain (variant) replacement, i.e. competitive exclusion, but more detailed analyses would be required to clarify this.
Divergence tree from TreeTime run, visualized in ggtree.
There are various methods to investigate changes in evolutionary rates of VOC/VOIs and compare their relative fitness in an epidemiological context.
For example, root-to-tip plots give estimates of substitution rates. Clusters with stronger positive slopes than the average SARS-CoV-2 dataset, are an indication that they are accumlating mutations at a faster pace, or clusters that jump up above the average could indicate a mutational jump (simlar to what we saw with Alpha when it first appeared in the UK).
Maximum likelihood tree (IQ-TREE) processed with root-to-tip regression and plotting in R.
rooted <- read.tree('./data_needed/msa_0908_CANall_downsamp10perday_1.rtt.nwk')
get.dates <- function(phy, delimiter='_', pos=-1, format='%Y-%m-%d') {
dt <- sapply(phy$tip.label, function(x) {
tokens <- strsplit(x, delimiter)[[1]]
if (pos < 0) { return(tokens[length(tokens)+pos+1]) }
else { return(tokens[pos]) }
})
as.Date(dt, format=format)
}
tip.dates <- get.dates(rooted, pos=-2)
# total branch length from root to each tip
div <- node.depth.edgelength(rooted)[1:Ntip(rooted)]
# match tips to metadata to retrieve PANGO lineage assignments
accno <- gsub(".+_(EPI_ISL_[0-9]+)_.+", "\\1", rooted$tip.label)
index <- match(accno, metaCANall$Accession_ID)
pg <- metaCANall$pango.group[index]
blobs <- function(x, y, col, cex=1) {
points(x, y, pch=21, cex=cex)
points(x, y, bg=col, col=rgb(0,0,0,0), pch=21, cex=cex)
}
dlines <- function(x, y, col) {
lines(x, y, lwd=2.5)
lines(x, y, col=col)
}
vocs <- c('Alpha', 'Beta', 'Gamma', 'Delta', 'Omicron')
pal <- hcl.colors(n=length(vocs), palette="Sunset")
par(mar=c(5,5,0,1))
plot(tip.dates, div, type='n', las=1, cex.axis=0.6, cex.lab=0.7, bty='n',
xaxt='n', xlab="Sample collection date", ylab="Divergence from root")
xx <- floor_date(seq(min(tip.dates), max(tip.dates), length.out=5), unit="months")
axis(side=1, at=xx, label=format(xx, "%b %Y"), cex.axis=0.6)
blobs(tip.dates[pg=='other'], div[pg=='other'], col='grey', cex=0.8)
fit0 <- rlm(div[pg=='other'] ~ tip.dates[pg=='other'])
abline(fit0, col='gray50')
fits <- list(other=fit0)
for (i in 1:length(vocs)) {
variant <- vocs[i]
x <- tip.dates[pg==variant]
if (all(is.na(x))) next
y <- div[pg==variant]
blobs(x, y, col=pal[i], cex=0.8)
fit <- rlm(y ~ x)
dlines(fit$x[,2], predict(fit), col=pal[i])
fits[[variant]] <- fit
}
legend(x=min(tip.dates), y=max(div), legend=vocs, pch=21, pt.bg=pal, bty='n', cex=0.8)
ci <- lapply(fits, confint.default)
kable(data.frame(
n=sapply(fits, function(f) nrow(f$x)),
est=29970*sapply(fits, function(f) f$coef[2]),
lower.95=29970*sapply(ci, function(f) f[2,1]),
upper.95=29970*sapply(ci, function(f) f[2,2])
),
col.names = c("Number of genomes", "Estimate", "Lower 95% CI", "Upper 95% CI"),
format="html", align="rrrr", caption="Molecular clock rates (subs/genome/day)",
format.args = list(scientific = FALSE), digits=4, table.attr = "style='width:100%;'")
| Number of genomes | Estimate | Lower 95% CI | Upper 95% CI | |
|---|---|---|---|---|
| other | 3284 | 0.0478 | 0.0464 | 0.0492 |
| Alpha | 628 | 0.0377 | 0.0312 | 0.0442 |
| Beta | 16 | 0.0126 | -0.0046 | 0.0299 |
| Gamma | 289 | 0.0597 | 0.0508 | 0.0686 |
| Delta | 181 | 0.0671 | 0.0489 | 0.0853 |
Overall, the evolutionary rate among genomes sampled in Canada (0.0678102 subs/genome/day, grey line) is close to the global average of 0.0674941 subs/genome/day. Compared to other lineages sampled in Canada, variant of concern Alpha (B.1.1.7) exhibited a slightly but significantly lower rate of evolution. Both variants of concern Gamma (P.1) and Delta (B.1.617.2) exhibited higher rates, although only Gamma was significantly higher.
Add here:
Are there any BEAST analyses we’d like to do? e.g. infer R0, serial interval, etc for the different Delta sublineages (might have to restrict this geographically to specific PTs with enough coverage)
Collect a list of bioinformatics, phylogenetic, and modelling tools that are useful for SARS-CoV-2 analyses:
We thank all the authors, developers, and contributors to the GISAID and VirusSeq database for making their SARS-CoV-2 sequences publicly available. We thank especially the Canadian Public Health Laboratory Network, academic sequencing partners, diagnostic hospital labs, and other sequencing partners for the provision of the Canadian sequence data used in this work. Genome sequencing in Canada was supported by a Genome Canada grant to the Canadian COVID-19 Genomic Network (CanCOGeN).
The version numbers of all packages in the current environment as well as information about the R install is reported below.
sessionInfo()
## R version 4.1.1 (2021-08-10)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 22000)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_Canada.1252 LC_CTYPE=English_Canada.1252
## [3] LC_MONETARY=English_Canada.1252 LC_NUMERIC=C
## [5] LC_TIME=English_Canada.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] MASS_7.3-54 viridis_0.6.1 viridisLite_0.4.0 colorspace_2.0-2
## [5] RColorBrewer_1.1-2 kableExtra_1.3.4 gridExtra_2.3 ggbeeswarm_0.6.0
## [9] DT_0.19 cowplot_1.1.1 ggtree_3.0.4 phytools_0.7-90
## [13] maps_3.4.0 phangorn_2.7.1 tidytree_0.3.5 phylotools_0.2.2
## [17] ape_5.5 treeio_1.16.2 lubridate_1.7.10 reticulate_1.22
## [21] knitr_1.36 forcats_0.5.1 stringr_1.4.0 dplyr_1.0.7
## [25] purrr_0.3.4 readr_2.0.2 tidyr_1.1.4 tibble_3.1.4
## [29] ggplot2_3.3.5 tidyverse_1.3.1
##
## loaded via a namespace (and not attached):
## [1] ellipsis_0.3.2 fs_1.5.0 aplot_0.1.1
## [4] rstudioapi_0.13 farver_2.1.0 fansi_0.5.0
## [7] xml2_1.3.2 codetools_0.2-18 mnormt_2.0.2
## [10] jsonlite_1.7.2 broom_0.7.9 dbplyr_2.1.1
## [13] png_0.1-7 compiler_4.1.1 httr_1.4.2
## [16] backports_1.2.1 assertthat_0.2.1 Matrix_1.3-4
## [19] fastmap_1.1.0 lazyeval_0.2.2 cli_3.0.1
## [22] htmltools_0.5.2 tools_4.1.1 igraph_1.2.6
## [25] coda_0.19-4 gtable_0.3.0 glue_1.4.2
## [28] clusterGeneration_1.3.7 fastmatch_1.1-3 Rcpp_1.0.7
## [31] cellranger_1.1.0 jquerylib_0.1.4 vctrs_0.3.8
## [34] svglite_2.0.0 nlme_3.1-153 xfun_0.26
## [37] rvest_1.0.1 lifecycle_1.0.1 scales_1.1.1
## [40] hms_1.1.1 parallel_4.1.1 expm_0.999-6
## [43] yaml_2.2.1 ggfun_0.0.4 yulab.utils_0.0.2
## [46] sass_0.4.0 stringi_1.7.4 highr_0.9
## [49] plotrix_3.8-2 rlang_0.4.11 pkgconfig_2.0.3
## [52] systemfonts_1.0.2 evaluate_0.14 lattice_0.20-45
## [55] labeling_0.4.2 patchwork_1.1.1 htmlwidgets_1.5.4
## [58] tidyselect_1.1.1 magrittr_2.0.1 R6_2.5.1
## [61] generics_0.1.0 combinat_0.0-8 DBI_1.1.1
## [64] pillar_1.6.3 haven_2.4.3 withr_2.4.2
## [67] scatterplot3d_0.3-41 modelr_0.1.8 crayon_1.4.1
## [70] utf8_1.2.2 tmvnsim_1.0-2 tzdb_0.1.2
## [73] rmarkdown_2.11 grid_4.1.1 readxl_1.3.1
## [76] reprex_2.0.1 digest_0.6.28 webshot_0.5.2
## [79] numDeriv_2016.8-1.1 gridGraphics_0.5-1 munsell_0.5.0
## [82] beeswarm_0.4.0 ggplotify_0.1.0 vipor_0.4.5
## [85] bslib_0.3.0 quadprog_1.5-8